import os
import glob
import pandas as pd
import numpy as np
from tqdm import tqdm
import librosa
import seaborn as sns
import matplotlib.pyplot as plt
from librosa.display import specshow
import IPython.display as ipd
from sklearn.manifold import TSNE
from sklearn.metrics import pairwise_distances
from scipy import signal
from scipy.stats import entropy
from numpy.linalg import norm
from scipy.spatial.distance import pdist
from scipy.stats import norm
from umap import UMAP
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA, LatentDirichletAllocation
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.cluster import SpectralClustering
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
%matplotlib inline
def JSD(P, Q):
    """Jensen-Shannon divergence between two non-negative vectors.

    Both inputs are L1-normalised first, so they may be unnormalised
    histograms. Uses the natural log (scipy's default), so the result
    lies in [0, ln 2].
    """
    p = P / np.linalg.norm(P, ord=1)
    q = Q / np.linalg.norm(Q, ord=1)
    m = (p + q) / 2.0
    return (entropy(p, m) + entropy(q, m)) / 2.0
def jaccard(x, y):
    """Jaccard-style distance for L1-normalised histograms.

    Computes 1 minus the overlap mass (element-wise minimum summed);
    0 for identical unit-mass vectors, 1 for disjoint supports.
    """
    overlap = np.minimum(x, y).sum()
    return 1 - overlap
In the previous notebook, we tried to use a Neural Network directly on the full FFT. This gave bad results. Then we tried to slice the music into 0.5 s pieces, create a covariance matrix for each piece and feed these to an RNN. This gave better results but still low (45% accuracy with only 8 classes). Now we will explore another way to classify songs, with a simpler model.
The first step is to extract a frequency histogram for each beat. These can be used the way we use SIFT descriptors: we cluster those features, then build a Bag of Words of audio features for each sample. From there, the data can be processed the same way as text data.
As is well explained in this video, there is another way to extract music features: the Constant-Q Transform. This is what we will use, in combination with beat tracking, to slice every song. First, let's look at it on one example.
# Load one example track and force a common 44.1 kHz mono format.
base_song_path = "fma_small/000/000005.mp3"
y, sr = librosa.load(base_song_path, sr=None, mono=True)
if sr != 44100:
    # Keyword arguments work on both old and new librosa APIs
    # (the positional form was removed in librosa 0.10).
    y = librosa.resample(y, orig_sr=sr, target_sr=44100)
    sr = 44100
# Short-time Fourier transform; 1024-sample hop at 44.1 kHz ~= 23 ms.
# signal.windows.tukey is the stable location of the Tukey window
# (the scipy.signal.tukey alias was removed in SciPy 1.13).
D = librosa.stft(y,
                 n_fft=1024,
                 hop_length=1024,
                 win_length=1024,
                 window=signal.windows.tukey(1024),
                 )
# Track beats on the waveform; the hop_length must match the CQT hop
# below so that beat frame indices line up with CQT columns.
tempo, beats = librosa.beat.beat_track(y=y, sr=sr, hop_length=1024)
beat_times = librosa.frames_to_time(beats, sr=sr, hop_length=1024)
# Magnitude of the constant-Q transform (84 frequency bins, per the
# xlim/range(84) usage later in the notebook).
cqt = np.abs(librosa.cqt(y, sr=sr, hop_length=1024))
# Refine each beat interval into sub-beat boundaries.
subseg = librosa.segment.subsegment(cqt, beats, n_segments=1)
subseg_t = librosa.frames_to_time(subseg, sr=sr, hop_length=1024)
# Plot the STFT magnitude (dB) with beat positions overlaid.
plt.figure(figsize=(20, 12))
specshow(librosa.amplitude_to_db(np.abs(D)), y_axis='hz', x_axis='time')
lims = plt.gca().get_ylim()
plt.vlines(beat_times, lims[0], lims[1], color='lime', alpha=0.9, linewidth=2, label='Beats')
# Fix: the 'Beats' label was set but plt.legend() was never called on
# this figure (unlike the CQT plot below), so it was never displayed.
plt.legend(frameon=True, shadow=True)
plt.title('FFT + Beat and sub-beat markers', fontsize=25)
plt.show()
# Plot the CQT magnitude (dB) with beat positions overlaid.
plt.figure(figsize=(20,12))
specshow(librosa.amplitude_to_db(cqt, ref=np.max), y_axis='cqt_hz', x_axis='time')
lims = plt.gca().get_ylim()
plt.vlines(beat_times, lims[0], lims[1], color='lime', alpha=0.9, linewidth=2, label='Beats')
# Sub-beat markers kept for reference but intentionally not drawn:
# plt.vlines(subseg_t, lims[0], lims[1], color='linen', linestyle='--',linewidth=1.5, alpha=0.5, label='Sub-beats')
plt.legend(frameon=True, shadow=True)
plt.title('CQT + Beat and sub-beat markers', fontsize=25)
plt.show()
Now we can manually take 2 similar slices and listen to them, or show a zoom of their spectrum.
# Pick two sub-beat segments by index and listen to them.
step_1 = 4
step_2 = 12
# Convert segment boundary times to waveform sample indices.
start = int(subseg_t[step_1]*sr)
stop = int(subseg_t[step_1+1]*sr)
ipd.Audio(y[start:stop], rate=sr)  # NOTE(review): in a notebook only the cell's last expression renders
start = int(subseg_t[step_2]*sr)
stop = int(subseg_t[step_2+1]*sr)
ipd.Audio(y[start:stop], rate=sr)
# Extract the CQT columns of the two chosen sub-beat slices and show
# their spectrograms side by side.
a = cqt[:, subseg[step_1]:subseg[step_1+1]]
b = cqt[:, subseg[step_2]:subseg[step_2+1]]
plt.figure(figsize=(20,12))
plt.subplot(1, 2, 1)
specshow(librosa.amplitude_to_db(a, ref=np.max))
plt.subplot(1, 2, 2)
specshow(librosa.amplitude_to_db(b, ref=np.max))
plt.show()
Of course the sound is very similar as it is the same song. What we can finally do is reduce each slice to a single vector of the means along time. Then we can compute the Jensen-Shannon divergence or the Jaccard distance.
# Collapse each slice to its time-averaged spectrum, normalise to unit
# mass, and compare the two resulting distributions with both metrics.
a = a.mean(axis=1)
a_unite = a/a.sum()
b = b.mean(axis=1)
b_unite = b / b.sum()
jsd = JSD(a_unite, b_unite)
jac = jaccard(a_unite, b_unite)
print("Jenson Shannon Divergence =", jsd)
print("Jaccard Distance =", jac)
plt.figure(figsize=(20,12))
plt.plot(a_unite);
plt.plot(b_unite);
plt.title("Comparaison of both average spectrum", fontsize=25)
plt.xlim(0, 84)
plt.show()
def extract_features(audio_path):
    """Slice one song at sub-beat boundaries and return the time-averaged
    CQT spectrum of every slice.

    Parameters
    ----------
    audio_path : str
        Path to an audio file readable by librosa.

    Returns
    -------
    (key, result) : (list of str, list of np.ndarray)
        ``key[i]`` is the song id (file name without its extension),
        repeated once per slice; ``result[i]`` is the average CQT vector
        of slice i (normalisation happens later, inside JSD).  Both lists
        are empty when the song could not be processed.
    """
    y, sr = librosa.load(audio_path, sr=None, mono=True)
    id_ = os.path.basename(audio_path)[:-4]
    if sr != 44100:
        # Keyword form works on both old and new librosa APIs.
        y = librosa.resample(y, orig_sr=sr, target_sr=44100)
        sr = 44100
    result = []
    key = []
    try:
        tempo, beats = librosa.beat.beat_track(y=y, sr=sr, hop_length=1024)
        cqt = np.abs(librosa.cqt(y, sr=sr, hop_length=1024))
        subseg = librosa.segment.subsegment(cqt, beats, n_segments=1)
        for i in range(len(subseg) - 1):
            portion = cqt[:, subseg[i]:subseg[i + 1]]
            avg_portion = portion.mean(axis=1)  # will be normalized inside JSD
            result.append(avg_portion)
            key.append(id_)
    except Exception as exc:
        # Was a bare `except:`, which also swallows KeyboardInterrupt and
        # hides the failure cause; keep best-effort behaviour but report it.
        print("error on : ", id_, "-", exc)
    return key, result
# Resume point: everything up to and including index `jump_to` was already
# extracted in a previous run (a checkpoint CSV is flushed every 100 songs).
jump_to = 1000 # to be used to restart at a certain point, as we save every 100 samples
X1 = []  # song ids, one entry per extracted slice
X2 = []  # average-CQT vectors, aligned with X1
for i, audio_path in enumerate(glob.glob("fma_small/*/*.mp3")):
    if i <= jump_to:
        continue
    key, vector = extract_features(audio_path)
    X1 += key
    X2 += vector
    # Flush a checkpoint every 100 songs, then reset the in-memory buffers.
    if i > 0 and i % 100 == 0:
        df = pd.DataFrame(X2)
        df["song"] = X1
        df = df.set_index("song")
        df.to_csv("preprocessed_audio/mean_cqt/save_{}.csv".format(i//100))
        X1 = []
        X2 = []
# Write whatever remains after the last full batch of 100.
df = pd.DataFrame(X2)
df["song"] = X1
df = df.set_index("song")
df.to_csv("preprocessed_audio/mean_cqt/final.csv")
During this phase we had issues with only 3 samples (026600, 114069, 143095). This will not impact the model too much, so we can continue as is. Now let's merge our 80 datasets into a global one.
# Concatenate every per-batch checkpoint CSV into one global dataset.
frames = [pd.read_csv(path, index_col=0)
          for path in glob.glob("preprocessed_audio/mean_cqt/*.csv")]
pd.concat(frames, axis=0).to_csv("preprocessed_audio/mean_cqt.csv")
At this stage we have 456287 vectors of averaged subsamples. The idea now is to reduce each 84-dimension vector to a cluster number. This will allow us to build a Bag of Words of the features present in each song. The benefit is being able to handle songs of any length, and it is a very light feature representation, as a song becomes a simple sparse vector.
# Reload the merged dataset and eyeball the first rows.
df = pd.read_csv("preprocessed_audio/mean_cqt.csv", index_col=0)
df.sort_index().head()
To do some plot, the best is to have a tidy version. Let's do it just for this
# Melt to tidy (long) format — one row per (song, cqt_bin, value) — then
# box-plot the value distribution of each of the 84 bins.
df_tidy = pd.melt(df.reset_index(), id_vars=["song"], value_vars=list(map(str, range(84))))
df_tidy.variable = df_tidy.variable.astype(int)
plt.figure(figsize=(20,12))
sns.boxplot(x="variable", y="value", data=df_tidy, palette="PRGn")
plt.show()
# Correlation between the 84 CQT bins across all slices.
corr = df.corr()
plt.figure(figsize=(14, 12))
sns.heatmap(corr)
plt.show()  # was plt.plot(), which draws nothing and leaves the figure unrendered
We can see correlations between frequencies that are multiples of each other, with values that get smaller the further we are from the beginning.
Now we are facing an issue. The vectors we have are distributions over frequency. The best distances to use in that case are the Jensen-Shannon divergence or the Jaccard distance. But every clustering algorithm that accepts a custom distance requires the pairwise distance matrix, which would in our case be a 456287 x 456287 float32 matrix requiring around 800 GB of RAM... I also tried to implement a kind of DBSCAN doing online learning, but it doesn't work as expected. As a result we will have to stick to MiniBatch K-Means, which uses the Euclidean distance.
# L1-normalise every row so each vector is a distribution over the 84
# bins; the +1e-6 guards against division by zero on all-zero slices.
df = pd.read_csv("preprocessed_audio/mean_cqt.csv", index_col=0)
df = df.div(df.sum(axis=1)+1e-6, axis=0)
df = df.dropna(axis=0, how='any')
The first idea was to find a threshold with the Jaccard distance (faster to compute). This could be used for clustering as a maximum radius around an observation.
# Jaccard distance from every observation to two reference vectors, with
# a Gaussian fit of each distance distribution for visual comparison.
# .as_matrix() was removed in pandas 1.0; .values is the equivalent.
X = df.values
a = X[0, :]
b = X[1, :]
c = np.apply_along_axis(jaccard, axis=1, arr=X, y=a)
d = np.apply_along_axis(jaccard, axis=1, arr=X, y=b)
x = np.linspace(0, 1, 100)
# `norm` here is scipy.stats.norm (the later import shadows numpy.linalg.norm).
y1 = norm.pdf(x, np.mean(c), np.std(c))
y2 = norm.pdf(x, np.mean(d), np.std(d))
plt.figure(figsize=(20, 12))
# `normed` was removed in matplotlib 3.1; `density` is the replacement.
plt.hist(c, bins=200, density=True, label="First Vector")
plt.hist(d, bins=200, density=True, label="Second Vector")
plt.plot(x, y1, label="Normal law with distribution params")
plt.plot(x, y2, label="Normal law with distribution params")
plt.legend()
plt.title("Histogram of distances of all vector to one vector")
plt.show()
mean = np.mean(c)
std = np.std(c)
# Tail mass below mean - 3.3*std under the fitted normal: candidate
# similarity threshold for radius-based clustering.
norm.cdf(mean - 3.3*std, mean, std)
0.04% of observations are below 3.3 standard deviations compared to the second vector (only true in general if the distribution follows a normal law). This can be a good threshold, as we may end up with a few thousand clusters, which is reasonable. Let's plot an image of the clustered vectors.
# Rows whose Jaccard distance to each reference vector is far below the
# mean: candidate cluster members, stacked as image rows.
temp1 = X[np.where(c < np.mean(c) - 3.5 * np.std(c))]
temp2 = X[np.where(d < np.mean(d) - 3.5 * np.std(d))]
plt.figure(figsize=(12, 12))
plt.subplot(1, 2, 1)
# NOTE(review): extent uses (n_rows, n_cols) as (width, height) — looks
# swapped for an (n_obs, 84) matrix, but it is purely cosmetic.
plt.imshow(temp1, extent=[0, temp1.shape[0], 0, temp1.shape[1]], aspect='auto');
plt.title(temp1.shape)
plt.subplot(1, 2, 2)
plt.imshow(temp2, extent=[0, temp2.shape[0], 0, temp2.shape[1]], aspect='auto');
plt.title(temp2.shape)
plt.show()
It's correct, but the clusters are still not perfect; we still have important differences.
# Greedy radius clustering: repeatedly take the first remaining vector,
# compute Jaccard distances to everything left, and peel off the points
# within (mean - 3*std) as one cluster.  O(n^2) overall.
df = pd.read_csv("preprocessed_audio/mean_cqt.csv", index_col=0)
df = df.div(df.sum(axis=1)+1e-6, axis=0)
df = df.dropna(axis=0, how='any')
# .as_matrix() was removed in pandas 1.0; .values is the equivalent.
X = df.values
X_init = df.values
index_list = np.arange(X.shape[0])    # original row index of each remaining vector
cluster_list = np.zeros(X.shape[0])   # cluster id assigned to each original row
cluster_num = 1
while X.shape[0] > 0:
    print("Cluster : {} - Remaining observations : {}".format(cluster_num, X.shape[0]))
    a = X[0, :]
    b = np.apply_along_axis(jaccard, axis=1, arr=X, y=a)
    mean = np.mean(b)
    std = np.std(b)
    # Radius: 3 sigmas below the mean distance, floored at 0.01 so the
    # seed itself (distance 0) is always captured and the loop terminates.
    threshold = max(mean - 3 * std, 0.01)
    index_below_threshold = np.where(b < threshold)
    index_above_threshold = np.where(b >= threshold)
    index_cluster = index_list[index_below_threshold]  # original indices of this cluster's members
    cluster_list[index_cluster] = cluster_num
    index_list = index_list[index_above_threshold]
    X = X[index_above_threshold]
    cluster_num += 1
This implementation is bad: we very quickly get a lot of clusters (> 10k) and the algorithm is very, very slow... it ran for a few hours, so I stopped it. We can still plot some clusters after training, as we did before.
# Visualise two of the greedy clusters (member vectors stacked as rows).
temp1 = X_init[np.where(cluster_list==2)]
temp2 = X_init[np.where(cluster_list==3)]
plt.figure(figsize=(12, 12))
plt.subplot(1, 2, 1)
plt.imshow(temp1, extent=[0, temp1.shape[0], 0, temp1.shape[1]], aspect='auto');
plt.title(temp1.shape)
plt.subplot(1, 2, 2)
plt.imshow(temp2, extent=[0, temp2.shape[0], 0, temp2.shape[1]], aspect='auto');
plt.title(temp2.shape)
plt.show()
As mentioned previously, we can use MiniBatch K-Means, but we have to ensure that our clusters are well balanced in terms of size and homogeneous.
# Reload and L1-normalise the data, then cluster with MiniBatchKMeans.
# Euclidean distance is a compromise: the custom metrics (JSD/Jaccard)
# would need the full pairwise distance matrix, which does not fit in RAM.
df = pd.read_csv("preprocessed_audio/mean_cqt.csv", index_col=0)
df = df.div(df.sum(axis=1)+1e-6, axis=0)
df = df.dropna(axis=0, how='any')
from sklearn.cluster import MiniBatchKMeans
kmeans = MiniBatchKMeans(n_clusters=1000,
                         init='k-means++',
                         max_iter=30,
                         batch_size=1000,
                         verbose=0,
                         compute_labels=True,
                         random_state=None  # NOTE(review): not reproducible; set a seed to fix runs
                         )
clusters = kmeans.fit_predict(df)
Now we started with 1000 clusters. Let's look at the content of 2 clusters
# Inspect the content of two clusters (one image row per member vector).
# .as_matrix() was removed in pandas 1.0; .values is the equivalent.
X = df.values
temp1 = X[np.where(clusters==999)]
temp2 = X[np.where(clusters==998)]
plt.figure(figsize=(12, 12))
plt.subplot(1, 2, 1)
plt.imshow(temp1, extent=[0, temp1.shape[0], 0, temp1.shape[1]], aspect='auto')
plt.title(temp1.shape)
plt.subplot(1, 2, 2)
plt.imshow(temp2, extent=[0, temp2.shape[0], 0, temp2.shape[1]], aspect='auto')
plt.title(temp2.shape)
plt.show()
One is quite clean but the other one is very noisy and will lead to bad predictions. We can also look at the number of item per cluster and the histogram of this balance
# Histogram of cluster ids = number of items per cluster; keep the
# returned (counts, bins, patches) tuple `a` for the next plot.
plt.figure(figsize=(20, 12))
a = plt.hist(clusters, bins=999)
plt.title("Number of items per clusters")
plt.show()
We can see that the values go from only a few to more than 2000. Let's see the balance.
# Distribution of the per-cluster counts computed just above (a[0]).
plt.figure(figsize=(20, 12))
plt.hist(a[0], bins=100)
plt.title("Balance of the number of item per cluster")  # fixed typo: "Blance"
plt.show()
We can see that we have very big and very small clusters. Maybe we can try another number of clusters. The objective is not to have the same size everywhere, because some sounds won't be found in every song, but increasing this number may improve the clustering too. Let's try 1500.
# Retry with more clusters (1500) and a smaller batch size.
from sklearn.cluster import MiniBatchKMeans
kmeans = MiniBatchKMeans(n_clusters=1500,
                         init='k-means++',
                         max_iter=30,
                         batch_size=300,
                         verbose=0,
                         compute_labels=True,
                         random_state=None
                         )
clusters = kmeans.fit_predict(df)
# Same inspection as before, on the 1500-cluster model.
# .as_matrix() was removed in pandas 1.0; .values is the equivalent.
X = df.values
temp1 = X[np.where(clusters==999)]
temp2 = X[np.where(clusters==998)]
plt.figure(figsize=(12, 12))
plt.subplot(1, 2, 1)
plt.imshow(temp1, extent=[0, temp1.shape[0], 0, temp1.shape[1]], aspect='auto')
plt.title(temp1.shape)
plt.subplot(1, 2, 2)
plt.imshow(temp2, extent=[0, temp2.shape[0], 0, temp2.shape[1]], aspect='auto')
plt.title(temp2.shape)
plt.show()
Still a lot of noise...
# Top: items per cluster; bottom: distribution of those counts.
plt.figure(figsize=(20, 12))
plt.subplot(2, 1, 1)
a = plt.hist(clusters, bins=999)
plt.subplot(2, 1, 2)
plt.hist(a[0], bins=100)
plt.show()
So let's retrain the model with 1k clusters.
# Final model: back to 1000 clusters, then persist it to disk.
kmeans = MiniBatchKMeans(n_clusters=1000,
                         init='k-means++',
                         max_iter=30,
                         batch_size=1000,
                         verbose=0,
                         compute_labels=True,
                         random_state=None
                         )
clusters = kmeans.fit_predict(df)
# sklearn.externals.joblib was removed in scikit-learn 0.23; the
# standalone joblib package has the same API and file format.
import joblib
joblib.dump(kmeans, 'kmeans.pkl')
The last thing we can do is check the Jensen-Shannon divergence per cluster. This can be done by taking all pairwise distances between the items of each cluster.
# Intra-cluster pairwise JSD for two clusters: a homogeneous cluster
# should concentrate its distances near 0.
dist_vector = pdist(X[np.where(clusters==0)], metric=JSD)
dist_vector2 = pdist(X[np.where(clusters==150)], metric=JSD)
plt.figure(figsize=(20, 12))
plt.hist(dist_vector, bins=200)
plt.hist(dist_vector2, bins=200)
plt.show()
We can see that some clusters are correct (the blue one), with a mean JSD of 0.05, but for the other cluster the value is 50% higher... This is where we could improve the clustering, but I don't know yet how :(
# Persist the per-slice cluster assignment, keyed by song id.
df["clusters"] = clusters
df[["clusters"]].to_csv("preprocessed_audio/clusters.csv")
df = pd.read_csv("preprocessed_audio/clusters.csv", index_col=0)
# One-hot encode the cluster ids, then sum per song: a Bag of Words where
# each "word" is an audio cluster and the counts are its occurrences.
ohe = pd.get_dummies(df, columns = ["clusters"])
ohe = ohe.groupby(ohe.index).sum()
ohe.to_csv("preprocessed_audio/sample_content.csv")
ohe[ohe["clusters_10"] > 0].head()
Now let's try to find a song similar to a given song.
# Load the per-song BoW and attach the top-level genre from the FMA
# metadata (two-level column header, hence the ("track", "genre_top") key).
df = pd.read_csv("preprocessed_audio/sample_content.csv", index_col=0)
genres = pd.read_csv("preprocessed_meta/genres.csv", index_col=0)
tracks = pd.read_csv("preprocessed_meta/tracks.csv", index_col=0,
                     encoding="iso-8859-1",
                     header=[0, 1],
                     skipinitialspace=True)
df = df.join(tracks[("track", "genre_top")])
df.columns = list(df.columns[:-1]) + ["genres"]
df.head()
We have a BoW. In theory the best would be to use cosine similarity, but this doesn't work. As a result, we will use the cityblock distance on the normalised BoW.
# Normalise each song's BoW to unit L1 mass, then compute all pairwise
# cityblock (L1) distances; they therefore lie in [0, 2].
X = df.drop("genres", axis=1)
X2 = X.div(X.sum(axis=1)+1e-6, axis=0)
distance_matrix = pairwise_distances(X2, X2, metric='cityblock', n_jobs=-1)
Let's see distances between 1 sample song to the all other one.
# Distance distribution from one sample song to all others.
plt.figure(figsize=(20, 12))
plt.hist(distance_matrix[0], bins = 50)
plt.show()
Most songs don't share any values, so we get a distance of 2. Let's look only at the closest one.
# Find the nearest neighbour of song #2.
index_1 = 2
# argsort[0] is the song itself (distance 0); [1] is the closest other song.
index_2 = np.argsort(distance_matrix[index_1, :])[1]
song_1 = df.iloc[[index_1]].index.values[0]
song_2 = df.iloc[[index_2]].index.values[0]
distance_matrix[index_1, index_2]
The minimum distance is 0.88... not very good, but let's now listen to the 2 songs.
# FMA stores tracks as zero-padded 6-digit ids, bucketed in folders named
# after the first 3 digits; load and play the query song.
song_id = "{0:06d}".format(song_1)
folder = song_id[:3]
y, sr = librosa.load("fma_small/"+folder+"/"+song_id+".mp3", sr=None, mono = True)
if sr != 44100:
    y = librosa.resample(y, sr, 44100)
    sr = 44100
print(song_id)
ipd.Audio(y[:1000000], rate=sr)
# Load and play the retrieved nearest song for comparison.
song_id = "{0:06d}".format(song_2)
folder = song_id[:3]
y, sr = librosa.load("fma_small/"+folder+"/"+song_id+".mp3", sr=None, mono = True)
if sr != 44100:
    y = librosa.resample(y, sr, 44100)
    sr = 44100
print(song_id)
ipd.Audio(y[:1000000], rate=sr)
Hum... we can hear some similar sounds, but globally the songs are quite far apart. Maybe more songs and a finer decomposition of every song would help to fix this defect.
# Rebuild the BoW + genre dataset and prepare the train/test split for
# the classifiers below.
df = pd.read_csv("preprocessed_audio/sample_content.csv", index_col=0)
tracks = pd.read_csv("preprocessed_meta/tracks.csv", index_col=0,
                     encoding="iso-8859-1",
                     header=[0, 1],
                     skipinitialspace=True)
df = df.join(tracks[("track", "genre_top")])
df.columns = list(df.columns[:-1]) + ["genres"]
# .as_matrix() was removed in pandas 1.0; .values is the equivalent.
X = df.drop("genres", axis=1).values
enc = LabelEncoder()
y_enc = enc.fit_transform(df.genres.values)
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)
# BUG FIX: the split previously used the raw counts `X`, so the TF-IDF
# matrix computed just above was never used — contradicting the stated
# intent of trying the models on TF-IDF features.
X_train, X_test, y_train, y_test = train_test_split(tfidf, y_enc, test_size=0.20, random_state=42)
X_train.dtype
First I wanted to try dimensionality reduction using PCA or LDA/LSA, without good results. So, to try classifiers, let's first compute the TF-IDF instead of only the TF and try a few models on this matrix.
# Baseline 1: RBF-kernel SVM with default hyperparameters.
clf = SVC()
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))
# Baseline 2: random forest, depth-limited to curb overfitting.
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators = 50, max_depth=8)
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))
# Baseline 3: multinomial naive Bayes, the classic text/BoW classifier.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))
Surprisingly, we have a very sparse matrix of 8000 x 1000 and we are still able to not overfit too much. This is a good point.
Compared to the previous notebook, this one is more focused on a classical approach: clustering music the way we do with text. We can then apply all the methods from text mining instead of image processing. This has great benefits. First, we can do online learning, as we can process any new song in a very simple way. We can also understand what a specific user likes or dislikes by applying reinforcement methods (if they skip a song, we can remove points from every preceding bucket with a discount factor).
Classification results are lower than with the previous method, at around 35% correct. We haven't dug into this further, since we knew upfront that our model has defects. The main one is the clustering: I don't know a good way to perform clustering on such a huge dataset with a custom distance. In the next notebook, we will use another approach to segment the audio, using Hidden Semi-Markov Models.